{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征工程 tf-idf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants</th>\n",
       "      <th>Plasma_glucose_concentration</th>\n",
       "      <th>blood_pressure</th>\n",
       "      <th>Triceps_skin_fold_thickness</th>\n",
       "      <th>serum_insulin</th>\n",
       "      <th>BMI</th>\n",
       "      <th>Diabetes_pedigree_function</th>\n",
       "      <th>Age</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>6</td>\n",
       "      <td>148</td>\n",
       "      <td>72</td>\n",
       "      <td>35</td>\n",
       "      <td>0</td>\n",
       "      <td>33.6</td>\n",
       "      <td>0.627</td>\n",
       "      <td>50</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>85</td>\n",
       "      <td>66</td>\n",
       "      <td>29</td>\n",
       "      <td>0</td>\n",
       "      <td>26.6</td>\n",
       "      <td>0.351</td>\n",
       "      <td>31</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>8</td>\n",
       "      <td>183</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>23.3</td>\n",
       "      <td>0.672</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>89</td>\n",
       "      <td>66</td>\n",
       "      <td>23</td>\n",
       "      <td>94</td>\n",
       "      <td>28.1</td>\n",
       "      <td>0.167</td>\n",
       "      <td>21</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>137</td>\n",
       "      <td>40</td>\n",
       "      <td>35</td>\n",
       "      <td>168</td>\n",
       "      <td>43.1</td>\n",
       "      <td>2.288</td>\n",
       "      <td>33</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants  Plasma_glucose_concentration  blood_pressure  \\\n",
       "0          6                           148              72   \n",
       "1          1                            85              66   \n",
       "2          8                           183              64   \n",
       "3          1                            89              66   \n",
       "4          0                           137              40   \n",
       "\n",
       "   Triceps_skin_fold_thickness  serum_insulin   BMI  \\\n",
       "0                           35              0  33.6   \n",
       "1                           29              0  26.6   \n",
       "2                            0              0  23.3   \n",
       "3                           23             94  28.1   \n",
       "4                           35            168  43.1   \n",
       "\n",
       "   Diabetes_pedigree_function  Age  Target  \n",
       "0                       0.627   50       1  \n",
       "1                       0.351   31       0  \n",
       "2                       0.672   32       1  \n",
       "3                       0.167   21       0  \n",
       "4                       2.288   33       1  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv(r\"C:\\Users\\yun\\Desktop\\pima-indians-diabetes.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train['Target']\n",
    "X_train = train.drop('Target',axis=1)\n",
    "column_org = X_train.columns\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>pregnants_tfidf</th>\n",
       "      <th>Plasma_glucose_concentration_tfidf</th>\n",
       "      <th>blood_pressure_tfidf</th>\n",
       "      <th>Triceps_skin_fold_thickness_tfidf</th>\n",
       "      <th>serum_insulin_tfidf</th>\n",
       "      <th>BMI_tfidf</th>\n",
       "      <th>Diabetes_pedigree_function_tfidf</th>\n",
       "      <th>Age_tfidf</th>\n",
       "      <th>Target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.037717</td>\n",
       "      <td>0.810132</td>\n",
       "      <td>0.409804</td>\n",
       "      <td>0.256931</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.185363</td>\n",
       "      <td>0.003410</td>\n",
       "      <td>0.271919</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.009341</td>\n",
       "      <td>0.691357</td>\n",
       "      <td>0.558183</td>\n",
       "      <td>0.316326</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.218049</td>\n",
       "      <td>0.002836</td>\n",
       "      <td>0.250508</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.046188</td>\n",
       "      <td>0.920021</td>\n",
       "      <td>0.334562</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.118057</td>\n",
       "      <td>0.003357</td>\n",
       "      <td>0.159835</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.005813</td>\n",
       "      <td>0.450469</td>\n",
       "      <td>0.347351</td>\n",
       "      <td>0.156119</td>\n",
       "      <td>0.787603</td>\n",
       "      <td>0.143341</td>\n",
       "      <td>0.000840</td>\n",
       "      <td>0.105602</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.426849</td>\n",
       "      <td>0.129587</td>\n",
       "      <td>0.146243</td>\n",
       "      <td>0.866498</td>\n",
       "      <td>0.135338</td>\n",
       "      <td>0.007082</td>\n",
       "      <td>0.102151</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   pregnants_tfidf  Plasma_glucose_concentration_tfidf  blood_pressure_tfidf  \\\n",
       "0         0.037717                            0.810132              0.409804   \n",
       "1         0.009341                            0.691357              0.558183   \n",
       "2         0.046188                            0.920021              0.334562   \n",
       "3         0.005813                            0.450469              0.347351   \n",
       "4         0.000000                            0.426849              0.129587   \n",
       "\n",
       "   Triceps_skin_fold_thickness_tfidf  serum_insulin_tfidf  BMI_tfidf  \\\n",
       "0                           0.256931             0.000000   0.185363   \n",
       "1                           0.316326             0.000000   0.218049   \n",
       "2                           0.000000             0.000000   0.118057   \n",
       "3                           0.156119             0.787603   0.143341   \n",
       "4                           0.146243             0.866498   0.135338   \n",
       "\n",
       "   Diabetes_pedigree_function_tfidf  Age_tfidf  Target  \n",
       "0                          0.003410   0.271919       1  \n",
       "1                          0.002836   0.250508       0  \n",
       "2                          0.003357   0.159835       1  \n",
       "3                          0.000840   0.105602       0  \n",
       "4                          0.007082   0.102151       1  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf = TfidfTransformer()\n",
    "X_train_tfidf = tfidf.fit_transform(X_train).toarray()\n",
    "feat_names = column_org + \"_tfidf\"\n",
    "X_train_tfidf=pd.DataFrame(columns=feat_names,data=X_train_tfidf)\n",
    "train = pd.concat([X_train_tfidf, y_train], axis = 1)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv('Tfidf_pima-indians-diabetes.csv',index = False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
